/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - Department of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is TestIndexers.java.
*
* The Original Code is Copyright (C) 2004-2010 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
*
*/
package org.terrier.indexing;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import gnu.trove.TObjectIntHashMap;
import java.io.ByteArrayInputStream;
import java.util.HashMap;
import java.util.Map;
import org.junit.Before;
import org.junit.Test;
import org.terrier.indexing.tokenisation.EnglishTokeniser;
import org.terrier.structures.BitIndexPointer;
import org.terrier.structures.BitPostingIndexInputStream;
import org.terrier.structures.DirectIndex;
import org.terrier.structures.DocumentIndex;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.Index;
import org.terrier.structures.InvertedIndex;
import org.terrier.structures.Lexicon;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.MetaIndex;
import org.terrier.structures.postings.FieldIterablePosting;
import org.terrier.structures.postings.FieldPosting;
import org.terrier.structures.postings.IterablePosting;
import org.terrier.tests.ApplicationSetupBasedTest;
import org.terrier.utility.ApplicationSetup;
//TODO: does not check block positions
public class TestIndexers extends ApplicationSetupBasedTest {
@Before public void setIndexerProperties() {
ApplicationSetup.setProperty("indexer.meta.forward.keys", "filename");
ApplicationSetup.setProperty("indexer.meta.reverse.keys", "");
ApplicationSetup.setProperty("termpipelines", "");
}
@SuppressWarnings("unchecked")
protected void testIndexer(Indexer indexer, boolean directExpected, boolean fieldsExpected) throws Exception {
Map<String,String> doc1Props = new HashMap<String,String>();doc1Props.put("filename", "doc1");
Map<String,String> doc2Props = new HashMap<String,String>();doc2Props.put("filename", "doc2");
Document[] sourceDocs = !fieldsExpected ?
new Document[]{
new FileDocument("doc1", new ByteArrayInputStream("cats dogs horses".getBytes()), new EnglishTokeniser()),
new FileDocument("doc2", new ByteArrayInputStream("chicken cats chicken chicken".getBytes()), new EnglishTokeniser())
}
: new Document[]{
new TaggedDocument(new ByteArrayInputStream("<title>cats</title> dogs horses".getBytes()), doc1Props, new EnglishTokeniser()),
new TaggedDocument(new ByteArrayInputStream("<title>chicken</title> cats chicken chicken".getBytes()), doc2Props, new EnglishTokeniser())
};
int[] doclens = new int[]{3, 4};
// INVERTED ----------
// [num_terms][num_posts]
int[][] invIds = new int[4][];
// [num_terms][num_posts]
int[][] invTfs = new int[4][];
// [num_terms][num_fields][num_posts]
int[][][] invFfs = new int[4][2][];
// DIRECT ----------
Map<String,int[]>[] dirFfs = new Map[2];
TObjectIntHashMap<String>dirTfs[] = new TObjectIntHashMap[2];
String[] termStrings = new String[] { "cats", "chicken", "dogs", "horses" };
// populate inverted
// 0 (0,1) // dogs
// "dogs" occur in docid 0
invIds[2] = new int[] { 0 };
// "dogs" has TF 1 in docid 0
invTfs[2] = new int[] { 1 };
// "dogs" has (TF_TITLE=0,TF_ELSE=1) for docid 0
invFfs[2] = new int[][] { {0,1} };
// 1 (0,1) // horses
invIds[3] = new int[] { 0 };
invTfs[3] = new int[] { 1 };
invFfs[3] = new int[][] { {0,1} };
// 2 (0,1) (1,1) // cats
// "cats" occur in docids 0 and 1
invIds[0] = new int[] { 0, 1 };
// "cats" has TF 1 in docid 0, 1 in docid 1
invTfs[0] = new int[] { 1, 1 };
// "cats" has (TF_TITLE=1,TF_ELSE=0) for docid 0, (TF_TITLE=0,TF_ELSE=1) for docid 1
invFfs[0] = new int[][] { {1,0}, {0,1} };
// 3 (1,3) // chicken
invIds[1] = new int[] { 1 };
invTfs[1] = new int[] { 3 };
invFfs[1] = new int[][] { {1,2} };
// populate direct
// 0 (0,1) (1,1) (2,1) // doc1
// 0=dogs, 1=horses, 2=cats
// <title>cats</title> dogs horses
dirTfs[0] = new TObjectIntHashMap<String>();
dirTfs[0].put("cats", 1);
dirTfs[0].put("dogs", 1);
dirTfs[0].put("horses", 1);
dirFfs[0] = new HashMap<String,int[]>();
dirFfs[0].put("cats", new int[]{1,0});
dirFfs[0].put("dogs", new int[]{0,1});
dirFfs[0].put("horses", new int[]{0,1});
dirTfs[1] = new TObjectIntHashMap<String>();
dirTfs[1].put("cats", 1);
dirTfs[1].put("chicken", 3);
dirFfs[1] = new HashMap<String,int[]>();
dirFfs[1].put("cats", new int[]{0,1});
dirFfs[1].put("chicken", new int[]{1,2});
Collection col = new CollectionDocumentList(sourceDocs, "filename");
indexer.createDirectIndex(new Collection[]{col});
indexer.createInvertedIndex();
Index index = !fieldsExpected ?
Index.createIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)
: Index.createIndex(ApplicationSetup.TERRIER_INDEX_PATH, "fields");
assertNotNull(index);
MetaIndex meta = index.getMetaIndex();
assertNotNull(meta);
assertEquals("doc1", index.getMetaIndex().getItem("filename", 0));
assertEquals("doc2", index.getMetaIndex().getItem("filename", 1));
IterablePosting ip = null;
BitPostingIndexInputStream bpiis = null;
/** INVERTED FILE */
Lexicon<String> lexicon = index.getLexicon();
/**
* Test {@link IterablePosting} entries from a {@link InvertedIndex}
*/
InvertedIndex invertedIndex = index.getInvertedIndex();
assertNotNull(invertedIndex);
// for each term
for (int t = 0; t < termStrings.length; t++) {
LexiconEntry le = lexicon.getLexiconEntry(termStrings[t]);
assertNotNull(le);
ip = invertedIndex.getPostings((BitIndexPointer) le);
// for each document
int d = 0;
while (ip.next() != IterablePosting.EOL) {
assertEquals(invIds[t][d], ip.getId());
assertEquals(invTfs[t][d], ip.getFrequency());
assertEquals(doclens[invIds[t][d]], ip.getDocumentLength());
if (fieldsExpected) {
assertEquals(2, invFfs[t][d].length);
for (int f = 0; f < 2; f++) {
assertEquals(invFfs[t][d][f], ((FieldIterablePosting) ip).getFieldFrequencies()[f]);
}
}
d++;
}
ip.close();
}
// post-check
assertEquals(IterablePosting.EOL, ip.next());
/**
* Test {@link IterablePosting} entries from a {@link InvertedIndexInputStream}
*/
bpiis = (BitPostingIndexInputStream) index.getIndexStructureInputStream("inverted");
assertNotNull(bpiis);
// for each term
for (int t = 0; t < invIds.length; t++) {
assertTrue(bpiis.hasNext());
ip = bpiis.next();
assertNotNull(ip);
// for each document
int d = 0;
while (ip.next() != IterablePosting.EOL) {
assertEquals(invIds[t][d], ip.getId());
assertEquals(invTfs[t][d], ip.getFrequency());
assertEquals(doclens[invIds[t][d]], ip.getDocumentLength());
if (fieldsExpected) {
assertEquals(2, invFfs[t][d].length);
for (int f = 0; f < 2; f++) {
assertEquals(invFfs[t][d][f], ((FieldIterablePosting) ip).getFieldFrequencies()[f]);
}
}
d++;
}
}
// post-check
assertFalse(bpiis.hasNext());
/**
* Test posting array entries from a {@link InvertedIndex}
*/
// for each term
for (int t = 0; t < termStrings.length; t++) {
LexiconEntry le = lexicon.getLexiconEntry(termStrings[t]);
assertNotNull(le);
int[][] documents = invertedIndex.getDocuments(le);
if (!fieldsExpected) {
assertTrue(documents.length >= 2);
}
else {
// array should have length at least 4: 1 for the id, 1 for the
// frequency, 2 for the fields (optionally more for the blocks)
assertTrue(documents.length >= 4);
}
// check number of terms
assertEquals(invIds[t].length, documents[0].length);
assertEquals(invTfs[t].length, documents[1].length);
// for each document
for (int d = 0; d < documents[0].length; d++) {
// test document id
assertEquals(invIds[t][d], documents[0][d]);
// test document frequency
assertEquals(invTfs[t][d], documents[1][d]);
if (fieldsExpected) {
// test number of indexed fields
assertEquals(2, invFfs[t][d].length);
// test field frequency
for (int f = 0; f < 2; f++) {
assertEquals(invFfs[t][d][f], documents[2+f][d]);
}
}
}
}
/** DIRECT FILE */
if (directExpected) {
DocumentIndex documentIndex = index.getDocumentIndex();
/**
* Test {@link IterablePosting} entries from a {@link DirectIndex}
*/
DirectIndex directIndex = index.getDirectIndex();
assertNotNull(directIndex);
// for each document
for (int d = 0; d < dirTfs.length; d++) {
DocumentIndexEntry de = documentIndex.getDocumentEntry(d);
assertNotNull(de);
ip = directIndex.getPostings((BitIndexPointer) de);
FieldPosting fp = fieldsExpected ? (FieldPosting)ip : null;
// for each term
int t = 0;
int countFoundTerms = 0;
while (ip.next() != IterablePosting.EOL) {
int termid = ip.getId();
assertTrue(termid >= 0);
String term = lexicon.getLexiconEntry(termid).getKey();
assertNotNull(term);
countFoundTerms++;
assertTrue(dirTfs[d].containsKey(term));
assertEquals(dirTfs[d].get(term), ip.getFrequency());
assertEquals(doclens[d], ip.getDocumentLength());
if (fieldsExpected) {
assertEquals(2, fp.getFieldFrequencies().length);
for (int f = 0; f < 2; f++) {
assertEquals(dirFfs[d].get(term)[f], fp.getFieldFrequencies()[f]);
}
}
t++;
}
assertEquals(dirTfs[d].size() ,countFoundTerms);
ip.close();
}
// post-check
assertEquals(IterablePosting.EOL, ip.next());
/**
* Test {@link IterablePosting} entries from a {@link DirectIndexInputStream}
*/
bpiis = (BitPostingIndexInputStream) index.getIndexStructureInputStream("direct");
assertNotNull(bpiis);
// for each document
for (int d = 0; d < dirTfs.length; d++) {
assertTrue(bpiis.hasNext());
ip = bpiis.next();
assertNotNull(ip);
FieldPosting fp = fieldsExpected ? (FieldPosting)ip : null;
// for each term
int t = 0;
int countFoundTerms = 0;
while (ip.next() != IterablePosting.EOL) {
int termid = ip.getId();
assertTrue(termid >= 0);
String term = lexicon.getLexiconEntry(termid).getKey();
assertNotNull(term);
countFoundTerms++;
assertTrue(dirTfs[d].containsKey(term));
assertEquals(dirTfs[d].get(term), ip.getFrequency());
assertEquals(doclens[d], ip.getDocumentLength());
if (fieldsExpected) {
assertEquals(2, fp.getFieldFrequencies().length);
for (int f = 0; f < 2; f++) {
assertEquals(dirFfs[d].get(term)[f], fp.getFieldFrequencies()[f]);
}
}
t++;
}
assertEquals(dirTfs[d].size() ,countFoundTerms);
}
// post-check
assertFalse(bpiis.hasNext());
/**
* Test posting array entries from a {@link DirectIndex}
*/
// for each document
for (int d = 0; d < dirTfs.length; d++) {
DocumentIndexEntry de = documentIndex.getDocumentEntry(d);
assertNotNull(de);
int[][] terms = directIndex.getTerms(de);
if (!fieldsExpected) {
assertTrue(terms.length >= 2);
}
else {
// array should have length at least 4: 1 for the id, 1 for the
// frequency, 2 for the fields (optionally more for the blocks)
assertTrue(terms.length >= 4);
}
// check number of terms
assertEquals(dirTfs[d].size(), terms[0].length);
assertEquals(dirTfs[d].size(), terms[1].length);
// for each term
for (int t = 0; t < terms[0].length; t++) {
// test term id
String term = lexicon.getLexiconEntry(terms[0][t]).getKey();
assertTrue(dirTfs[d].containsKey(term));
assertEquals(dirTfs[d].get(term), terms[1][t]);
if (fieldsExpected) {
// test field frequency
for (int f = 0; f < 2; f++) {
assertEquals(dirFfs[d].get(term)[f], terms[2+f][t]);
}
}
}
}
}
}
@Test
public void testBasicNoFields() throws Exception
{
ApplicationSetup.setProperty("FieldTags.process", "");
testIndexer(new BasicIndexer(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX), true, false);
}
@Test
public void testBasicFields() throws Exception
{
ApplicationSetup.setProperty("FieldTags.process", "TITLE,ELSE");
testIndexer(new BasicIndexer(ApplicationSetup.TERRIER_INDEX_PATH, "fields"), true, true);
}
@Test
public void testBlockNoFields() throws Exception
{
ApplicationSetup.setProperty("FieldTags.process", "");
testIndexer(new BlockIndexer(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX), true, false);
}
@Test
public void testBlockFields() throws Exception
{
ApplicationSetup.setProperty("FieldTags.process", "TITLE,ELSE");
testIndexer(new BasicIndexer(ApplicationSetup.TERRIER_INDEX_PATH, "fields"), true, true);
}
@Test
public void testBasicSPNoFields() throws Exception
{
ApplicationSetup.setProperty("FieldTags.process", "");
testIndexer(new BasicSinglePassIndexer(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX), false, false);
}
@Test
public void testBasicSPFields() throws Exception
{
ApplicationSetup.setProperty("FieldTags.process", "TITLE,ELSE");
testIndexer(new BasicIndexer(ApplicationSetup.TERRIER_INDEX_PATH, "fields"), false, true);
}
@Test
public void testBlockSPNoFields() throws Exception
{
ApplicationSetup.setProperty("FieldTags.process", "");
testIndexer(new BlockSinglePassIndexer(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX), false, false);
}
@Test
public void testBlockSPFields() throws Exception
{
ApplicationSetup.setProperty("FieldTags.process", "TITLE,ELSE");
testIndexer(new BasicIndexer(ApplicationSetup.TERRIER_INDEX_PATH, "fields"), false, true);
}
}